Data loading

library("readr")
library("igraph")
library("dplyr")
library("stringr")
library("scales")
library("textreuse")
source("R/helper.R")
source("R/section-matches.R")

Read the data.

# Restore the cached LSH comparison results. Presumably this provides
# all_matches, best_matches, and summary_matches used below -- confirm
# against the script that writes cache/corpus-lsh.rda.
load("cache/corpus-lsh.rda")

Spectrograms

source("R/spectrogram.R")

Spectrogram of borrowings in CA1850. I expect it will show heavy reliance on NY1849 amidst original material.

# Best-match spectrogram for CA1850 (white_list presumably limits how many
# source codes get distinct colors -- see R/spectrogram.R).
spectrogram("CA1850", best_matches, white_list = 7)
## Loading required package: ggplot2
## Selecting by n

Spectrogram of borrowings in MD1855. I expect to see reliance on English legislation (GB1852 and GB1854), amidst original material.

# Best-match spectrogram for MD1855.
spectrogram("MD1855", best_matches, white_list = 7)
## Selecting by n

Spectrogram of borrowings in WA1855. An earlier version showed two long runs of borrowings from Oregon and Indiana. If the latest data still visualizes that kind of splicing, I think it’s a great illustration.

# Best-match spectrogram for WA1855.
spectrogram("WA1855", best_matches, white_list = 8)
## Selecting by n

The point of this and the next exercise is not to show borrowings from all sources, but similarity between two sources. Our normal spectrogram of NV1861 will show heavy reliance on California. I want this to show that even though Nevada is two steps away from New York, its text is still very similar to New York throughout.

# Every section of NV1861 that appears anywhere in all_matches, one row per
# section, so the spectrogram covers the whole code, not only the matches.
nv_sections <- all_matches %>%
  filter(borrower_code == "NV1861") %>%
  `[[`("borrower_section") %>%
  unique() %>%
  sort()
all_NV1861 <- data_frame(borrower_code = "NV1861",
                         borrower_section = nv_sections)

# For each NV1861 section, keep the single strongest match to any New York
# code enacted by 1861, ignoring matches with score below 0.1.
matches_to_NY <- all_matches %>%
  filter(borrower_code == "NV1861") %>%
  filter(str_detect(match_code, "NY")) %>%
  filter(score >= 0.1, match_year <= 1861) %>%
  group_by(borrower_section) %>%
  slice(which.max(score)) %>%
  ungroup()

# Attach the best NY match (if any) to every NV1861 section; unmatched
# sections get NA match columns, which plot as original material. The join
# keys are spelled out so the output no longer depends on dplyr's guess.
NV1861toNY <- all_NV1861 %>%
  left_join(matches_to_NY, by = c("borrower_code", "borrower_section")) %>%
  arrange(borrower_section)
spectrogram("NV1861", NV1861toNY, white_list = 5,
            title = "Sections in NV1861 with high similarity to NY codes")
## Selecting by n

As with the previous query for NV1861, Iowa is two steps away from New York. This time, however, the text is much less similar to New York. I only want one visualization for this, but I’m not sure if IA1851 or the revision IA1859 will be better, so can we do both?

IA1851

# Every section of IA1851 present in all_matches, one row per section.
ia1851_sections <- all_matches %>%
  filter(borrower_code == "IA1851") %>%
  `[[`("borrower_section") %>%
  unique() %>%
  sort()
all_IA1851 <- data_frame(borrower_code = "IA1851",
                         borrower_section = ia1851_sections)

# For each IA1851 section, keep its single best match (by score) to any New
# York code enacted by 1851, ignoring matches with score below 0.1.
matches_to_NY <- all_matches %>%
  filter(borrower_code == "IA1851") %>%
  filter(str_detect(match_code, "NY")) %>%
  filter(score >= 0.1, match_year <= 1851) %>%
  group_by(borrower_section) %>%
  slice(which.max(score)) %>%
  ungroup()

# Attach the best NY match (if any) to every IA1851 section, with explicit
# join keys rather than relying on dplyr's column-name guess.
IA1851toNY <- all_IA1851 %>%
  left_join(matches_to_NY, by = c("borrower_code", "borrower_section")) %>%
  arrange(borrower_section)
spectrogram("IA1851", IA1851toNY, white_list = 10,
            title = "Sections in IA1851 with matches to NY codes")
## Selecting by n

IA1859

# Every section of IA1859 present in all_matches, one row per section.
ia1859_sections <- all_matches %>%
  filter(borrower_code == "IA1859") %>%
  `[[`("borrower_section") %>%
  unique() %>%
  sort()
all_IA1859 <- data_frame(borrower_code = "IA1859",
                         borrower_section = ia1859_sections)

# For each IA1859 section, keep its single best match (by score) to any New
# York code enacted by 1859, ignoring matches with score below 0.1.
matches_to_NY <- all_matches %>%
  filter(borrower_code == "IA1859") %>%
  filter(str_detect(match_code, "NY")) %>%
  filter(score >= 0.1, match_year <= 1859) %>%
  group_by(borrower_section) %>%
  slice(which.max(score)) %>%
  ungroup()

# Attach the best NY match (if any) to every IA1859 section, with explicit
# join keys rather than relying on dplyr's column-name guess.
IA1859toNY <- all_IA1859 %>%
  left_join(matches_to_NY, by = c("borrower_code", "borrower_section")) %>%
  arrange(borrower_section)
spectrogram("IA1859", IA1859toNY, white_list = 10,
            title = "Sections in IA1859 with matches to NY codes")
## Selecting by n

Spectrogram of borrowings in NC1868. Should show heavy reliance on one or another New York Code, with scattered provisions coming from elsewhere.

# Best-match spectrogram for NC1868.
spectrogram("NC1868", best_matches, white_list = 8)
## Selecting by n

Spectrogram of borrowings in CO1868. Should show heavy reliance on Illinois law.

# Best-match spectrogram for CO1868.
spectrogram("CO1868", best_matches, white_list = 8)
## Selecting by n

Network graphs

Create a network graph based on section percentages.

# Code-to-code edge list weighted by the share of sections borrowed: keep
# non-trivial borrowings (>= 5%) and, for each borrowing code, its top two
# sources. top_n() keeps ties, and the result is left grouped (as the
# printed output below shows).
edges_pct <- summary_matches %>%
  filter(!is.na(match_code)) %>%
  filter(percent_borrowed >= 0.05) %>%
  transmute(borrower_code, match_code, weight = percent_borrowed) %>%
  group_by(borrower_code) %>%
  top_n(2, weight)
edges_pct
## Source: local data frame [136 x 3]
## Groups: borrower_code [83]
## 
##    borrower_code match_code weight
##            (chr)      (chr)  (dbl)
## 1         AK1900     OR1862 0.5937
## 2         AR1868     KY1851 0.3634
## 3         AR1868     KY1854 0.3158
## 4         AR1874     AR1868 0.6752
## 5         AR1874     KY1851 0.0818
## 6         AZ1865     CA1851 0.5476
## 7         AZ1865     CA1858 0.2590
## 8         AZ1887     CA1872 0.4313
## 9         CA1850     NY1849 0.2972
## 10        CA1850     NY1850 0.1207
## ..           ...        ...    ...
# Directed borrowing graph; the weight column becomes E(g)$weight.
g <- graph_from_data_frame(edges_pct, directed = TRUE) 
# Color each code by its unweighted path distance to NY1850:
# 0 = red, 1 = green, 2 = yellow, anything farther = lightblue.
nodes <- distances(g, to = "NY1850", algorithm = "unweighted") %>% as.data.frame() %>% 
  add_rownames() %>% 
  rename(name = rowname, distance = NY1850) %>% 
  mutate(color = ifelse(distance == 0, "red",
                        ifelse(distance == 1, "green",
                               ifelse(distance == 2, "yellow", "lightblue"))))
## Warning in distances(g, to = "NY1850", algorithm = "unweighted"):
## Unweighted algorithm chosen, weights ignored
# Force all four NY enactments (1848-1851) to the distance-0 color so they
# are treated as a single origin.
nodes[nodes$name == "NY1848", "color"] <- "red"
nodes[nodes$name == "NY1849", "color"] <- "red"
nodes[nodes$name == "NY1850", "color"] <- "red"
nodes[nodes$name == "NY1851", "color"] <- "red"
# Rebuild the graph with the colored vertex table attached as attributes.
g <- graph_from_data_frame(edges_pct, directed = TRUE, vertices = nodes) 
V(g)$year <- V(g)$name %>% extract_date()
set.seed(4221)

# Compute one fixed layout up front so the successive before-year plots
# below keep every node in the same position.
g <- add_layout_(g, with_graphopt(niter = 4000, spring.length = 25), normalize())

# Plot the subgraph of codes enacted up to `year`, reusing the layout stored
# on the full graph so node positions are stable across snapshots.
#
# x     igraph object with V(x)$year and a precomputed $layout attribute
# year  upper bound (inclusive) on vertex years to keep
plot_before_year <- function(x, year) {
  # induced_subgraph() is the current igraph name (induced.subgraph is the
  # deprecated alias); it copies the full graph's $layout attribute
  # unchanged, so subset its rows to the kept vertices (order is preserved).
  x_before <- induced_subgraph(x, which(V(x)$year <= year))
  keep <- V(x)$name %in% V(x_before)$name
  x_before$layout <- x_before$layout[keep, ]
  # Set tight margins for drawing, restoring the caller's par() on exit.
  old_par <- par(mar = c(0, 0, 1, 0))
  on.exit(par(old_par), add = TRUE)
  plot(x_before, edge.width = E(x_before)$weight * 8,
       edge.arrow.size = 0.0, vertex.size = 5)
  title(paste0("Codes of Civil Procedure before ", year))
} 

# One snapshot of the network every five years, 1850 through 1900.
for (yr in seq(1850, 1900, by = 5)) {
  plot_before_year(g, yr)
}

Create a graph based on numbers (not percentages) of sections shared. Notice that we are keeping only code to code matches that share a certain number of sections (minimum_n), we are keeping only a certain number of matches for each code (top_matches), and we are omitting codes that aren’t part of the main network.

# Keep only code-to-code matches that share at least this many sections...
minimum_n <- 20
# ...and at most this many source codes per borrowing code.
top_matches <- 2
# Codes excluded because they are not part of the main network; the
# commented-out entries are toggles left over from earlier experiments.
codes_not_to_plot <- c(
  # "CO868", 
  # "CT1879", 
  # "FL1847", 
  # "FL1892", 
  "GA1851",
  "GA1860", 
  "HI1859", 
  "HI1897",
  # "IL1866", 
  "LA1825", 
  "LA1844" 
#   "MS1848", 
#   "MS1857", 
  # "NY1876", 
  # "NY1879",
  # "VA1860", 
  # "VA1893", 
  # "WV1868" 
  )
# codes_not_to_plot <- NULL

# Edge list weighted by raw counts of sections borrowed, rescaled to [0, 1],
# with each code keeping only its top_matches sources and excluded codes
# removed on both ends of the edge. top_n() keeps ties.
edges_n <- summary_matches %>%
  filter(!is.na(match_code)) %>%
  filter(sections_borrowed >= minimum_n) %>%
  transmute(borrower_code, match_code, weight = sections_borrowed) %>%
  group_by(borrower_code) %>%
  top_n(top_matches, weight) %>%
  ungroup() %>%
  mutate(weight = rescale(weight)) %>%
  filter(!borrower_code %in% codes_not_to_plot) %>%
  filter(!match_code %in% codes_not_to_plot)
edges_n
## Source: local data frame [146 x 3]
## 
##    borrower_code match_code      weight
##            (chr)      (chr)       (dbl)
## 1         AK1900     OR1862 0.398192771
## 2         AK1900     NY1850 0.002409639
## 3         AK1900     OR1854 0.002409639
## 4         AR1868     KY1851 0.209036145
## 5         AR1868     KY1854 0.180120482
## 6         AR1874     AR1868 0.146987952
## 7         AR1874     KY1851 0.007228916
## 8         AZ1865     CA1851 0.199397590
## 9         AZ1865     CA1858 0.087951807
## 10        AZ1887     CA1872 0.203614458
## ..           ...        ...         ...
# Directed graph on the count-based edge list.
g_n <- graph_from_data_frame(edges_n, directed = TRUE) 
# distances() returns a matrix with one row per vertex and one column per
# NY target; take the row-wise minimum so each code's distance is to its
# nearest NY enactment.
node_distances <- distances(g_n, to = c("NY1848", "NY1849", "NY1850", "NY1851"),
                     algorithm = "unweighted") %>% 
                     apply(1, min, na.rm = TRUE)
## Warning in distances(g_n, to = c("NY1848", "NY1849", "NY1850", "NY1851"), :
## Unweighted algorithm chosen, weights ignored
# Same distance-to-NY color scheme as the percentage graph above.
nodes_n <- data_frame(name = names(node_distances), distance = node_distances) %>% 
  mutate(color = ifelse(distance == 0, "red",
                        ifelse(distance == 1, "green",
                               ifelse(distance == 2, "yellow", "lightblue"))))

# Rebuild with the vertex attributes attached.
g_n <- graph_from_data_frame(edges_n, directed = TRUE, vertices = nodes_n) 
V(g_n)$year <- V(g_n)$name %>% extract_date()

# Scale edge weights for plotting and clamp them to [min_val, max_val] so a
# few very heavy (or very light) edges cannot dominate the drawing.
#
# g          igraph object whose E(g)$weight attribute supplies the weights
# multiplier scale factor applied before clamping
# max_val    upper bound on the returned widths
# min_val    lower bound on the returned widths
#
# Returns a numeric vector of edge widths, one per edge of g.
edge_size_clamp <- function(g, multiplier = 20, max_val = 6, min_val = 1) {
  # Vectorized clamp: equivalent to the two-step index assignment, but in
  # one expression with no temporary mutation.
  pmin(pmax(E(g)$weight * multiplier, min_val), max_val)
}

# Same layout algorithm and seed as the percentage graph, for comparability.
set.seed(4221)
g_n <- g_n %>% add_layout_(with_graphopt(niter = 4000, spring.length = 25),
                           normalize())
par(mar = c(0,0,1,0))
plot(g_n, edge.width = edge_size_clamp(g_n), edge.arrow.size = 0, vertex.size = 5)
title("Borrowings between codes, number of sections borrowed")

Now do a state to state network:

min_state_borrowings <- 100
top_matches <- 2
# Collapse code-to-code borrowings to the state level: sum sections borrowed
# over all code pairs, drop within-state and backwards-in-time borrowings,
# keep each state's top two sources (ties kept), and rescale to [0, 1].
edges_states <- summary_matches %>%
  mutate(borrower_date = extract_date(borrower_code)) %>%
  mutate(match_date = extract_date(match_code)) %>%
  mutate(borrower_state = extract_state(borrower_code)) %>%
  mutate(match_state = extract_state(match_code)) %>%
  filter(!is.na(match_code)) %>%
  filter(borrower_date >= match_date) %>%
  filter(borrower_state != match_state) %>%
  group_by(borrower_state, match_state) %>%
  summarize(n = sum(sections_borrowed)) %>%
  filter(n >= min_state_borrowings) %>%
  select(borrower_state, match_state, weight = n) %>%
  group_by(borrower_state) %>%
  top_n(top_matches, weight) %>%
  ungroup() %>%
  mutate(weight = rescale(weight))

edges_states
## Source: local data frame [44 x 3]
## 
##    borrower_state match_state     weight
##             (chr)       (chr)      (dbl)
## 1              AK          OR 0.57655039
## 2              AR          KY 0.60562016
## 3              AZ          CA 0.75484496
## 4              CA          NY 0.69670543
## 5              CO          CA 0.05910853
## 6              CO          IL 0.04166667
## 7              DC          IN 0.11821705
## 8              DT          ND 0.22674419
## 9              DT          NE 0.19476744
## 10             FL          NY 0.03488372
## ..            ...         ...        ...
g_states <- graph_from_data_frame(edges_states, directed = TRUE)

# Color states by unweighted path distance to New York (NY itself red,
# direct borrowers green, two steps yellow, farther lightblue).
state_distances <- distances(g_states, to = "NY", algorithm = "unweighted") 
## Warning in distances(g_states, to = "NY", algorithm = "unweighted"):
## Unweighted algorithm chosen, weights ignored
nodes_states <- data_frame(name = rownames(state_distances),
                           distance = state_distances[, 1]) %>% 
  mutate(color = ifelse(distance == 0, "red",
                        ifelse(distance == 1, "green",
                               ifelse(distance == 2, "yellow", "lightblue"))))

# Rebuild with vertex attributes, then keep only the first component with
# at least 3 vertices. NOTE(review): decompose() does not sort components
# by size, so [[1]] is assumed (not guaranteed) to be the main network --
# confirm.
g_states <- graph_from_data_frame(edges_states, directed = TRUE,
                                  vertices = nodes_states) %>% 
  decompose(min.vertices = 3) %>% 
  `[[`(1)

# Same layout algorithm and seed as the code-level graphs, for comparability.
set.seed(4221)
g_states <- g_states %>% add_layout_(with_graphopt(niter = 4000,
                                                   spring.length = 25),
                                     normalize())
par(mar = c(0,0,1,0))
# Bug fix: edge widths must come from g_states, not g_n -- the code-level
# graph has a different number of edges in a different order, so its widths
# were being misapplied (or recycled) onto the state-level edges.
plot(g_states, 
     edge.width = edge_size_clamp(g_states), edge.arrow.size = 0.5,
     edge.arrow.mode = 1,
     vertex.size = 5, vertex.label.dist = 0.85, vertex.label.degree = pi)
title("Borrowings between states, number of sections borrowed")